import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
!pip install umap-learn
!pip install projection-pursuit
import umap
from skpp import ProjectionPursuitRegressor
# When exporting this notebook to HTML, adjust the plotly renderer here.
pio.renderers.default = "notebook" #"colab" is default
# Load the heart-disease dataset (expects heart.csv next to the notebook).
data = pd.read_csv("./heart.csv")
Checking data consistency:
# Report, per column, whether any missing (NaN) values are present.
data.isna().any()
Mode - represents the most frequent value. Since it is not included in Pandas' describe, we had to call it separately. It returns multiple rows when there are multiple modes, hence we only took the first row.
# Subset of attributes treated as numerical for the summary statistics below.
numerical_data = data[["age", "cp", "trestbps", "chol", "fbs", "thalach"]]
# mode() returns one row per tied mode; keep only the first row.
numerical_data.mode().iloc[0]
Describe offers descriptive statistics: the mean, median (here represented by 50%), range and standard deviation
# Descriptive statistics: count, mean, std, min/max and the quartiles.
numerical_data.describe()
Skewness measures the asymmetry of a distribution:
From the dataset, fbs has the highest positive value, while sex has the highest negative value.
The variable sex is heavily left-skewed (large negative skew); since it is categorical, this means there are more males than females in the dataset. The variable age has skewness very close to 0, which implies it is nearly symmetric, close to a normal distribution. For the rest of the variables, the skewness is more visible in the density plots.
# Skewness of every column; the sign indicates the direction of the tail.
data.skew()
Kurtosis measures the peakedness of a distribution:
From the dataset variables, chol has heavy outliers, while thalach lacks outliers
The variable chol, which is the level of cholesterol, has heavy outliers. The variables thalach, thal, and age don't have that many outliers.
# Kurtosis of every column; high values suggest heavy tails / outliers.
data.kurtosis()
We can see that sex is indeed heavily disproportionate, as well as fbs and exang. Some variables, such as restecg, ca and thal, even contain what look like outliers.
# Frequency table for every categorical attribute.
categorical_variables = ["sex", "fbs", "restecg", "exang", "cp", "slope", "ca", "thal"]
for column in categorical_variables:
    print(data[column].value_counts())
Here we can clearly see how skewness and kurtosis were determined
# Kernel-density plot of every attribute, laid out on a 7x2 grid.
nrow = 7
ncol = 2
fig, axes = plt.subplots(nrow, ncol, constrained_layout=True)
# axes.flat iterates the grid row by row, matching the original r/c order.
for count, ax in enumerate(axes.flat):
    column_name = data.columns[count]
    data.iloc[:, count].plot.density(
        ax=ax,
        title='Attribute "' + column_name + '" density plot',
        figsize=(10, 20),
    )
Shows how the attributes' values are distributed
# Histogram of every attribute, laid out on a 7x2 grid.
nrow = 7
ncol = 2
fig, axes = plt.subplots(nrow, ncol, constrained_layout=True)
# axes.flat iterates the grid row by row, matching the original r/c order.
for count, ax in enumerate(axes.flat):
    column_name = data.columns[count]
    data.iloc[:, count].plot.hist(
        ax=ax,
        title='Attribute "' + column_name + '" frequency plot',
        figsize=(10, 20),
    )
The boxplot shows the minimum, first quartile (25% of the data), median (50% of the data), third quartile (75% of the data) and the maximum. The values outside of $[minimum, maximum]$ are plotted as points and are to be considered outliers.
# Boxplot of every attribute, laid out on a 7x2 grid; points outside the
# whiskers are potential outliers.
nrow = 7
ncol = 2
fig, axes = plt.subplots(nrow, ncol, constrained_layout=True)
# axes.flat iterates the grid row by row, matching the original r/c order.
for count, ax in enumerate(axes.flat):
    column_name = data.columns[count]
    data.iloc[:, count].plot.box(
        ax=ax,
        title='Attribute "' + column_name + '" boxplot',
        figsize=(10, 20),
    )
# Linear (Pearson) correlation between the numerical attributes.
numerical_data.corr(method="pearson").style.background_gradient(cmap='Blues')
# Rank-based (Spearman) correlation - robust to monotone non-linearity.
numerical_data.corr(method="spearman").style.background_gradient(cmap='Blues')
# Kendall's tau - another rank correlation, more robust for small samples.
numerical_data.corr(method="kendall").style.background_gradient(cmap='Blues')
# Pairwise chi-squared tests of independence between categorical attributes;
# pairs with p < 1e-4 are collected and reported at the end.
dof = 0.05  # significance threshold (alpha), not degrees of freedom
categorical_variables = ["sex", "fbs", "restecg", "exang", "cp", "slope", "ca", "thal", "target"]
semnificative = []
for i in range(len(categorical_variables)):
    for j in range(i + 1, len(categorical_variables)):
        # chi2_contingency accepts the contingency table directly; chi2 and p
        # are invariant under transposition, so this reproduces the original
        # margins=True + integer-label margin-stripping computation without
        # relying on the category labels being the integers 0..k-1.
        ct = pd.crosstab(data[categorical_variables[i]], data[categorical_variables[j]])
        chi2, p, _, _ = stats.chi2_contingency(ct)
        print(f"Independence between {categorical_variables[i]:<7} and {categorical_variables[j]:<7}. Chi-squared test: ")
        print(f"chi2: {chi2:.3f}, p-value: {p:.3f}")
        print(f"{'Failed, p-value bigger than ' + str(dof) if p > dof else 'Relevant'}\n")
        if p < 0.0001:
            semnificative.append([categorical_variables[i], categorical_variables[j]])
print("Semnificative pairs")
for a, b in semnificative:
    print(a, b)
# Pairwise Fisher's exact tests between the binary attributes; pairs with
# p < 1e-4 are collected and reported at the end.
dof = 0.05  # significance threshold (alpha), not degrees of freedom
binary_variables = ["sex", "fbs", "exang", "target"]
semnificative = []
for i in range(len(binary_variables)):
    for j in range(i + 1, len(binary_variables)):
        # fisher_exact accepts the 2x2 table directly; the odds ratio and
        # p-value are invariant under transposition, so this reproduces the
        # original margins=True + margin-stripping computation.
        ct = pd.crosstab(data[binary_variables[i]], data[binary_variables[j]])
        ratio, p2 = stats.fisher_exact(ct)
        print(f"Independence between {binary_variables[i]:<7} and {binary_variables[j]:<7}. Fisher's exact test:")
        print(f"ratio: {ratio:.3f}, p-value: {p2:.3f}")
        print(f"{'Failed, p-value bigger than ' + str(dof) if p2 > dof else 'Relevant'}\n")
        if p2 < 0.0001:
            semnificative.append([binary_variables[i], binary_variables[j]])
print("Semnificative pairs")
for a, b in semnificative:
    print(a, b)
# Two-sample t-test of every numerical variable split by each binary
# attribute; pairs with p < 1e-4 are collected and reported at the end.
dof = 0.05
binary_variables = ["sex", "fbs", "exang", "target"]
semnificative = []
for num_col in numerical_data.columns:
    for bin_col in binary_variables:
        # fbs appears in both lists; skip testing a variable against itself.
        if num_col == bin_col:
            continue
        positives = data[num_col][data[bin_col] == 1]
        negatives = data[num_col][data[bin_col] == 0]
        t, p = stats.ttest_ind(positives, negatives)
        print(f"Variable {num_col:<8} separated by {bin_col:<6} has t-value: {t:.3f} and p-value: {p:.3f}.")
        print(f"{'Failed, p-value bigger than ' + str(dof) if p > dof else 'Relevant'}\n")
        if p < 0.0001:
            semnificative.append([num_col, bin_col])
print("Semnificative pairs")
for a, b in semnificative:
    print(a, b)
If two variables are correlated we should see a clear relationship between the points.
# Interactive 2-D scatter plot: one trace per pair of attributes, coloured by
# the target; a slider switches which pair is displayed.
fig = go.Figure()
axis_labels = []
count = 0
for i in range(len(data.columns)):
    for j in range(i + 1, len(data.columns)):
        count += 1
        fig2 = px.scatter(data, x=data.iloc[:, i],
                          y=data.iloc[:, j],
                          color="target")
        fig.add_trace(fig2.data[0])
        axis_labels.append([data.columns[i], data.columns[j]])
        fig.data[count - 1].visible = True
steps = []
for i in range(len(fig.data)):
    # Each step reveals exactly one trace and retitles the axes accordingly.
    step = dict(
        method="update",
        args=[{"visible": [False] * len(fig.data)},
              {"title": "Slider switched to step: " + str(i),
               "xaxis.title": axis_labels[i][0],
               "yaxis.title": axis_labels[i][1]}],
    )
    step["args"][0]["visible"][i] = True
    steps.append(step)
sliders = [dict(
    active=count - 1,
    currentvalue={"prefix": "Frequency: "},
    pad={"t": 50},
    steps=steps,
)]
fig.update_layout(
    sliders=sliders,
    # Bug fix: the original alpha of 2 is outside the valid rgba() range of
    # 0-1 (browsers clamp it to 1, which is the value rendered anyway).
    plot_bgcolor='rgba(100,100,100,1)',
)
fig.show()
# Interactive 3-D scatter plot: one trace per triple of numerical variables,
# coloured by the target; a slider switches which triple is displayed.
variables = ["age", "trestbps", "chol", "thalach", "oldpeak"]
axis_labels = []
fig = go.Figure()
count = 0
for a in range(len(variables)):
    for b in range(a + 1, len(variables)):
        for c in range(b + 1, len(variables)):
            count += 1
            sub_fig = px.scatter_3d(data, x=variables[a],
                                    y=variables[b],
                                    z=variables[c],
                                    color="target")
            fig.add_trace(sub_fig.data[0])
            axis_labels.append([variables[a], variables[b], variables[c]])
            fig.data[count - 1].visible = True
steps = []
for step_idx, labels in enumerate(axis_labels):
    # One trace per step: reveal it and retitle the 3-D scene axes.
    visibility = [False] * len(fig.data)
    visibility[step_idx] = True
    steps.append(dict(
        method="update",
        args=[{"visible": visibility},
              {"title": "Slider switched to step: " + str(step_idx),
               "scene.xaxis.title": labels[0],
               "scene.yaxis.title": labels[1],
               "scene.zaxis.title": labels[2]}],
    ))
sliders = [dict(
    active=count - 1,
    currentvalue={"prefix": "Frequency: "},
    pad={"t": 50},
    steps=steps,
)]
fig.update_layout(sliders=sliders)
fig.show()
normalized_data = data.copy()
normalized_data = normalized_data.drop(columns=["target"])
normalized_data = (normalized_data - normalized_data.mean()) / normalized_data.std()
pca = PCA(n_components=normalized_data.shape[1])
pca.fit(normalized_data)
%matplotlib inline
plt.plot(pca.explained_variance_ratio_)
plt.ylabel('Explained Variance')
plt.xlabel('Components')
plt.xticks([i for i in range(len(pca.explained_variance_ratio_))])
plt.show()
# Project the standardised data onto the first three principal components.
pca3 = PCA(n_components=3)
new_data_3 = pca3.fit_transform(normalized_data)
new_data_3 = pd.DataFrame(data = new_data_3, columns = ['PCA 1', 'PCA 2', 'PCA 3'])
print(f'Explained variation per principal component: {pca3.explained_variance_ratio_}. Total: {sum(pca3.explained_variance_ratio_)}')
# Loadings: each original variable's weight in each principal component.
pca3_by_variables = pd.DataFrame(pca3.components_, columns = data.drop(columns="target").columns)
pca3_by_variables.T.index
# Same projection with five components, to compare the retained variance.
pca5 = PCA(n_components=5)
new_data_5 = pca5.fit_transform(normalized_data)
new_data_5 = pd.DataFrame(data = new_data_5, columns = ['PCA 1', 'PCA 2', 'PCA 3', 'PCA 4', 'PCA 5'])
print(f'Explained variation per principal component: {pca5.explained_variance_ratio_}. Total: {sum(pca5.explained_variance_ratio_)}')
# Loadings: each original variable's weight in each principal component.
pca_by_variables = pd.DataFrame(pca5.components_, columns = data.drop(columns="target").columns)
pca_by_variables
# 3-D scatter of the PCA(3) scores, coloured by target, with each variable's
# loading vector (scaled by 10) drawn as a line from the origin.
new_data_3["target"] = data["target"]
fig = px.scatter_3d(new_data_3, x=new_data_3.columns[0], y=new_data_3.columns[1], z=new_data_3.columns[2], color='target')
for attribute in pca3_by_variables.T.index:
    loading = pca3_by_variables[attribute]
    fig.add_trace(go.Scatter3d(
        x=[0, loading[0] * 10],
        y=[0, loading[1] * 10],
        z=[0, loading[2] * 10],
        mode='lines',
        name=attribute,
    ))
# Horizontal legend above the plot so the loadings don't crowd the scene.
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1,
))
fig.show()
Non-linear mappings transform the data from a multi-dimensional space to a two-dimensional one.
def sammon(x, n, display = 2, inputdist = 'raw', maxhalves = 20, maxiter = 500, tolfun = 1e-9, init = 'default'):
    """Sammon's non-linear mapping of `x` into an `n`-dimensional space.

    Minimises the Sammon stress by Newton iterations with step halving.

    Parameters
    ----------
    x : (N, d) array of raw samples, or (N, N) array of pairwise
        dissimilarities when inputdist == 'distance'. Off-diagonal
        dissimilarities must be strictly positive (rows must be unique).
    n : target dimensionality of the embedding.
    display : 0 = silent, 1 = report termination, >1 = report every epoch.
    inputdist : 'raw' (default) or 'distance'.
    maxhalves : maximum step-halving attempts per iteration.
    maxiter : maximum number of iterations.
    tolfun : relative stress-improvement tolerance for convergence.
    init : 'default' (pca for raw input, cmdscale for distances), 'pca',
        or anything else for a random normal initial configuration.

    Returns
    -------
    [y, E] : the (N, n) embedding and the final scaled Sammon stress.
    """
    import numpy as np
    from scipy.spatial.distance import cdist
    # Build the dissimilarity matrix D and resolve the default init choice.
    if inputdist == 'distance':
        D = x
        if init == 'default':
            init = 'cmdscale'
    else:
        D = cdist(x, x)
        if init == 'default':
            init = 'pca'
    if inputdist == 'distance' and init == 'pca':
        raise ValueError("Cannot use init == 'pca' when inputdist == 'distance'")
    if np.count_nonzero(np.diagonal(D)) > 0:
        raise ValueError("The diagonal of the dissimilarity matrix must be zero")
    N = x.shape[0]
    scale = 0.5 / D.sum()
    # Add the identity so the zero diagonal can be inverted safely.
    D = D + np.eye(N)
    # (A leftover debug print of np.count_nonzero(D<=0) was removed here.)
    if np.count_nonzero(D <= 0) > 0:
        raise ValueError("Off-diagonal dissimilarities must be strictly positive")
    Dinv = 1 / D
    # Initial configuration: PCA scores (via SVD) or a random normal cloud.
    if init == 'pca':
        [UU, DD, _] = np.linalg.svd(x)
        y = UU[:, :n] * DD[:n]
    else:
        y = np.random.normal(0.0, 1.0, [N, n])
    one = np.ones([N, n])
    d = cdist(y, y) + np.eye(N)
    dinv = 1. / d
    delta = D - d
    E = ((delta ** 2) * Dinv).sum()  # unscaled Sammon stress
    for i in range(maxiter):
        # Gradient g and (diagonal approximation of the) Hessian H of the
        # stress with respect to the configuration y.
        delta = dinv - Dinv
        deltaone = np.dot(delta, one)
        g = np.dot(delta, y) - (y * deltaone)
        dinv3 = dinv ** 3
        y2 = y ** 2
        H = np.dot(dinv3, y2) - deltaone - 2 * y * np.dot(dinv3, y) + y2 * np.dot(dinv3, one)
        s = -g.flatten(order='F') / np.abs(H.flatten(order='F'))  # Newton step
        y_old = y
        # Halve the step until the stress actually decreases.
        for j in range(maxhalves):
            s_reshape = np.reshape(s, (-1, n), order='F')
            y = y_old + s_reshape
            d = cdist(y, y) + np.eye(N)
            dinv = 1 / d
            delta = D - d
            E_new = ((delta ** 2) * Dinv).sum()
            if E_new < E:
                break
            else:
                s = 0.5 * s
        if j == maxhalves - 1:
            print('Warning: maxhalves exceeded. Sammon mapping may not converge...')
        # Converged when the relative improvement drops below tolfun.
        if abs((E - E_new) / E) < tolfun:
            if display:
                print('TolFun exceeded: Optimisation terminated')
            break
        E = E_new
        if display > 1:
            print('epoch = %d : E = %12.10f' % (i + 1, E * scale))
    if i == maxiter - 1:
        print('Warning: maxiter exceeded. Sammon mapping may not have converged...')
    E = E * scale
    return [y, E]
# Sammon mapping requires strictly positive off-diagonal distances, so drop
# duplicate rows first; np.unique also returns the indices of the survivors,
# which are used to keep the matching target labels.
feature_matrix = data.to_numpy()[:, :-1]
x, index = np.unique(feature_matrix, axis=0, return_index=True)
target = data.to_numpy()[:, -1][index]
y, E = sammon(x, 2)
fig = go.Figure()
for label, trace_name in ((0, 'Negative Heart Disease'), (1, 'Positive Heart Disease')):
    fig.add_trace(go.Scatter(x=y[target == label, 0], y=y[target == label, 1],
                             mode='markers',
                             name=trace_name))
fig.update_layout(title='Sammon Mapping')
fig.show()
# t-SNE embedding of the de-duplicated samples into two dimensions.
y = TSNE(n_components=2, learning_rate='auto', init="pca").fit_transform(x)
fig = go.Figure()
for label, trace_name in ((0, 'Negative Heart Disease'), (1, 'Positive Heart Disease')):
    fig.add_trace(go.Scatter(x=y[target == label, 0], y=y[target == label, 1],
                             mode='markers',
                             name=trace_name))
fig.update_layout(title='t-SNE Mapping')
fig.show()
# UMAP embedding with default settings; the constructor also exposes
# learning_rate, different distance metrics, spread and min_dist.
y = umap.UMAP().fit_transform(x)
fig = go.Figure()
for label, trace_name in ((0, 'Negative Heart Disease'), (1, 'Positive Heart Disease')):
    fig.add_trace(go.Scatter(x=y[target == label, 0], y=y[target == label, 1],
                             mode='markers',
                             name=trace_name))
fig.update_layout(title='uMap Mapping')
fig.show()
Available in the other document
Plotting every numerical variable against each categorical one as a boxplot. If two boxplots' notches do not overlap, there is a statistically significant difference between their medians.
# Boxplots of each numerical variable split by each categorical variable;
# a slider selects which (categorical, numerical) pair is displayed.
numerical_variables = ["age", "trestbps", "chol", "thalach", "oldpeak"]
categorical_variables = ["sex", "fbs", "restecg", "exang", "cp", "slope", "ca", "thal", "target"]
axis_labels = []
fig = go.Figure()
count = 0
for num_var in numerical_variables:
    for cat_var in categorical_variables:
        count += 1
        axis_labels.append([cat_var, num_var])
        fig.add_trace(go.Box(x=data[cat_var], y=data[num_var]))
        fig.data[count - 1].visible = True
steps = []
for step_idx in range(len(fig.data)):
    # One boxplot trace per step: reveal it and retitle the axes.
    visibility = [False] * len(fig.data)
    visibility[step_idx] = True
    steps.append(dict(
        method="update",
        args=[{"visible": visibility},
              {"title": "Slider switched to step: " + str(step_idx),
               "xaxis.title": axis_labels[step_idx][0],
               "yaxis.title": axis_labels[step_idx][1]}],
    ))
sliders = [dict(
    active=count - 1,
    currentvalue={"prefix": "Frequency: "},
    pad={"t": 50},
    steps=steps,
)]
fig.update_layout(sliders=sliders)
fig.show()
We can see that:
Allows us to visualize and compare the decompositions of the numeric value superimposed on each other.
# Overlaid histograms of each numerical variable, coloured by each categorical
# variable; a slider switches which (numerical, categorical) pair is shown.
numerical_variables = ["age", "trestbps", "chol", "thalach", "oldpeak"]
categorical_variables = ["sex", "fbs", "restecg", "exang", "cp", "slope", "ca", "thal", "target"]
axis_labels = []
show_traces = []  # per slider step: the indices of the traces to reveal
fig = go.Figure()
count = 0
for nv in numerical_variables:
    for cv in categorical_variables:
        # One histogram trace per category level of cv.
        fig2 = px.histogram(data, x=nv, color=cv)
        traces_to_show = []
        for histogram in fig2.data:
            traces_to_show.append(count)
            count += 1
            histogram.name = cv + " " + str(histogram.name)
            histogram.opacity = 0.8
            fig.add_trace(histogram)
        show_traces.append(traces_to_show)
        axis_labels.append([nv, cv])
steps = []
for i in range(len(show_traces)):
    step = dict(
        method="update",
        args=[{"visible": [False] * len(fig.data)},
              {"title": "Showing in relation to " + axis_labels[i][1],
               "xaxis.title": axis_labels[i][0],
               "yaxis.title": "count"}],
    )
    for j in show_traces[i]:
        step["args"][0]["visible"][j] = True
    steps.append(step)
sliders = [dict(
    # Bug fix: `active` indexes into `steps`; the original used the total
    # trace count, which is larger than the number of steps.
    active=len(steps) - 1,
    currentvalue={"prefix": "Frequency: "},
    pad={"t": 50},
    steps=steps,
)]
fig.update_layout(
    sliders=sliders,
    barmode="overlay",
)
fig.show()
Used to determine the relative decomposition of an attribute according to a categorical variable. Thus, we can see the contribution of each secondary group in overall value.
# Stacked histograms of each numerical variable, split by each categorical
# variable, showing each group's contribution to the overall counts; a slider
# switches which (numerical, categorical) pair is shown.
numerical_variables = ["age", "trestbps", "chol", "thalach", "oldpeak"]
categorical_variables = ["sex", "fbs", "restecg", "exang", "cp", "slope", "ca", "thal", "target"]
axis_labels = []
show_traces = []  # per slider step: the indices of the traces to reveal
fig = go.Figure()
count = 0
for nv in numerical_variables:
    for cv in categorical_variables:
        # One histogram trace per category level of cv.
        fig2 = px.histogram(data, x=nv, color=cv)
        traces_to_show = []
        for histogram in fig2.data:
            traces_to_show.append(count)
            count += 1
            histogram.name = cv + " " + str(histogram.name)
            histogram.opacity = 0.8
            fig.add_trace(histogram)
        show_traces.append(traces_to_show)
        axis_labels.append([nv, cv])
steps = []
for i in range(len(show_traces)):
    step = dict(
        method="update",
        args=[{"visible": [False] * len(fig.data)},
              {"title": "Showing in relation to " + axis_labels[i][1],
               "xaxis.title": axis_labels[i][0],
               "yaxis.title": "count"}],
    )
    for j in show_traces[i]:
        step["args"][0]["visible"][j] = True
    steps.append(step)
sliders = [dict(
    # Bug fix: `active` indexes into `steps`; the original used the total
    # trace count, which is larger than the number of steps.
    active=len(steps) - 1,
    currentvalue={"prefix": "Frequency: "},
    pad={"t": 50},
    steps=steps,
)]
fig.update_layout(
    sliders=sliders,
    barmode="stack",
)
fig.show()
Helps us visualize how dependent an attribute is to the target.
# One boxplot of every predictor against the target, selected via a slider.
axis_labels = []
fig = go.Figure()
count = 0
for pred in data.columns[:-1]:
    count += 1
    axis_labels.append(["target", pred])
    fig.add_trace(go.Box(x=data["target"], y=data[pred]))
    fig.data[count - 1].visible = True
steps = []
for step_idx in range(len(fig.data)):
    # One trace per step: reveal it and retitle the axes.
    visibility = [False] * len(fig.data)
    visibility[step_idx] = True
    steps.append(dict(
        method="update",
        args=[{"visible": visibility},
              {"title": "Slider switched to step: " + str(step_idx),
               "xaxis.title": axis_labels[step_idx][0],
               "yaxis.title": axis_labels[step_idx][1]}],
    ))
sliders = [dict(
    active=count - 1,
    currentvalue={"prefix": "Frequency: "},
    pad={"t": 50},
    steps=steps,
)]
fig.update_layout(sliders=sliders)
fig.show()
# Kendall correlation of every attribute with the target (last row of the
# correlation matrix), shown as a styled table and then as a bar chart.
data.corr(method="kendall")[-1:].style.background_gradient(cmap='Blues')
corr_data = data.corr(method="kendall").iloc[-1, :-1].to_numpy()
fig = go.Figure()
fig.add_trace(go.Bar(x=data.columns[:-1], y=corr_data))
fig.update_layout(title="Correlation barplot")
fig.show()
# Chi-squared independence test of each categorical attribute vs the target.
dof = 0.05  # significance threshold (alpha), not degrees of freedom
categorical_variables = ["sex", "fbs", "restecg", "exang", "cp", "slope", "ca", "thal"]
for i in range(len(categorical_variables)):
    # chi2_contingency accepts the contingency table directly; chi2 and p are
    # invariant under transposition, so this reproduces the original
    # margins=True + margin-stripping computation.
    ct = pd.crosstab(data[categorical_variables[i]], data["target"])
    chi2, p, _, _ = stats.chi2_contingency(ct)
    print(f"Independence between {categorical_variables[i]:<7} and target. Chi-squared test: ")
    print(f"chi2: {chi2:.3f}, p-value: {p:.3f}")
    print(f"{'Failed, p-value bigger than ' + str(dof) if p > dof else 'Relevant'}\n")
# Fisher's exact test of each binary attribute vs the target.
dof = 0.05  # significance threshold (alpha), not degrees of freedom
binary_variables = ["sex", "fbs", "exang"]
for i in range(len(binary_variables)):
    # fisher_exact accepts the 2x2 table directly; the odds ratio and p-value
    # are invariant under transposition, so this reproduces the original
    # margins=True + margin-stripping computation.
    ct = pd.crosstab(data[binary_variables[i]], data["target"])
    ratio, p2 = stats.fisher_exact(ct)
    print(f"Independence between {binary_variables[i]:<7} and target. Fisher's exact test:")
    print(f"ratio: {ratio:.3f}, p-value: {p2:.3f}")
    print(f"{'Failed, p-value bigger than ' + str(dof) if p2 > dof else 'Relevant'}\n")
# Two-sample t-test of the target split by each binary predictor.
binary_variables = ["sex", "fbs", "exang"]
for column in binary_variables:
    with_condition = data["target"][data[column] == 1]
    without_condition = data["target"][data[column] == 0]
    t, p = stats.ttest_ind(with_condition, without_condition)
    print(f"Variable target separated by {column:<5} has t-value: {t:.3f} and p-value: {p:.3f}.")
    print(f"{'Failed, p-value bigger than 0.05' if p > 0.05 else 'Relevant'}\n")